{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrating sklearn Modin Interoperability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logistic Regression example taken / adapted from https://www.ritchieng.com/pandas-scikit-learn/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modin.pandas as pd\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://www.ritchieng.com/pandas-scikit-learn/\n",
    "\n",
    "url = 'http://bit.ly/kaggletrain'\n",
    "train = pd.read_csv(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pclass: passenger class\n",
    "# Parch: parents and children\n",
    "feature_cols = ['Pclass', 'Parch']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# you want all rows, and the feature_cols' columns\n",
    "X = train.loc[:, feature_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now we want to create our response vector\n",
    "y = train.Survived"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. import\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# 2. instantiate model\n",
    "logreg = LogisticRegression()\n",
    "\n",
    "# 3. fit \n",
    "logreg.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url_test = 'http://bit.ly/kaggletest'\n",
    "test = pd.read_csv(url_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# missing Survived column because we are predicting\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_new = test.loc[:, feature_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. predict\n",
    "new_pred_class = logreg.predict(X_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# kaggle wants 2 columns\n",
    "# new_pred_class\n",
    "# PassengerId\n",
    "\n",
    "# pandas would align them next to each other\n",
    "# to ensure the first column is PassengerId, use .set_index\n",
    "kaggle_data = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId')\n",
    "kaggle_data.to_csv('sub.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save train data to disk using pickle\n",
    "train.to_pickle('train.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read data\n",
    "pd.read_pickle('train.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import Normalizer\n",
    "ct = ColumnTransformer(\n",
    "    [(\"norm1\", Normalizer(norm='l1'), [0, 1]),\n",
    "     (\"norm2\", Normalizer(norm='l1'), slice(2, 4))])\n",
    "X = pd.DataFrame(np.array([[0., 1., 2., 2.],\n",
    "              [1., 1., 0., 1.]]))\n",
    "# Normalizer scales each row of X to unit norm. A separate scaling\n",
    "# is applied for the two first and two last elements of each\n",
    "# row independently.\n",
    "ct.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction import FeatureHasher\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "X = pd.DataFrame({\n",
    "    \"documents\": [\"First item\", \"second one here\", \"Is this the last?\"],\n",
    "    \"width\": [3, 4, 5],\n",
    "})  \n",
    "ct = ColumnTransformer(\n",
    "    [(\"text_preprocess\", FeatureHasher(input_type=\"string\"), \"documents\"),\n",
    "     (\"num_preprocess\", MinMaxScaler(), [\"width\"])])\n",
    "X_trans = ct.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.impute import SimpleImputer\n",
    "imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
    "imp_mean.fit(pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]))\n",
    "\n",
    "X = pd.DataFrame([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]])\n",
    "print(imp_mean.transform(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "X, y = pd.DataFrame(np.arange(10).reshape((5, 2))), pd.Series(range(5))\n",
    "X\n",
    "list(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_test_split(y, shuffle=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Linear Regression example taken / adapted from https://github.com/chendaniely/2021-07-13-scipy-pandas/blob/main/05-models.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips = sns.load_dataset(\"tips\")\n",
    "tips = pd.DataFrame(tips)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.get_dummies(tips, drop_first=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import linear_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. create the model object\n",
    "lr = linear_model.LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. fit the model object\n",
    "lr.fit(X=tips[[\"total_bill\", \"size\"]], y=tips[\"tip\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look at the coefficients\n",
    "lr.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look at the intercept\n",
    "lr.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips_dummy = pd.get_dummies(tips, drop_first=True)[[\"tip\", \"total_bill\", \"smoker_No\"]]\n",
    "tips_dummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr2 = linear_model.LinearRegression()\n",
    "lr2.fit(X=tips_dummy.iloc[:, 1:], y=tips_dummy[\"tip\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr2.coef_, lr2.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_data = tips_dummy[[\"total_bill\", \"smoker_No\"]].tail() # not really new data\n",
    "new_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# use the model to give predicted tip values\n",
    "new_data[\"predicted_tips\"] = lr2.predict(new_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(new_data)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
